Data import
#install.packages("tidymodels")
#install.packages("themis")
library(here)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(tensorflow)
library(tfdatasets)
library(tidymodels)
library(keras)
library(caret)
library(themis)
# Load data ----
# The CSV path is resolved relative to the current working directory.
# (The former `setwd(getwd())` call was a no-op and has been removed.)
dataIn <- "../Data/Dataset-part-2.csv"
data_in <- read.csv(dataIn, header = TRUE, sep = ",")
# View(data_in)
# Work on a copy called `data`; `data_in` keeps the values exactly as read.
data <- data.frame(data_in)
# Per-column overview: ranges for numeric columns, length/class for text ones.
summary(data)
ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL
Min. :5008804 Length:67614 Length:67614 Length:67614 Min. : 0.0000 Min. : 26100
1st Qu.:5465941 Class :character Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 112500
Median :5954270 Mode :character Mode :character Mode :character Median : 0.0000 Median : 157500
Mean :5908133 Mean : 0.4206 Mean : 178867
3rd Qu.:6289080 3rd Qu.: 1.0000 3rd Qu.: 225000
Max. :7965248 Max. :19.0000 Max. :6750000
NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED
Length:67614 Length:67614 Length:67614 Length:67614 Min. :-25201 Min. :-17531
Class :character Class :character Class :character Class :character 1st Qu.:-19438 1st Qu.: -2886
Mode :character Mode :character Mode :character Mode :character Median :-15592 Median : -1305
Mean :-15914 Mean : 62022
3rd Qu.:-12347 3rd Qu.: -321
Max. : -7489 Max. :365243
FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
Min. :1 Min. :0.0000 Min. :0.0000 Min. :0.0000 Length:67614 Min. : 1.000
1st Qu.:1 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 Class :character 1st Qu.: 2.000
Median :1 Median :0.0000 Median :0.0000 Median :0.0000 Mode :character Median : 2.000
Mean :1 Mean :0.2028 Mean :0.2742 Mean :0.1005 Mean : 2.174
3rd Qu.:1 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.: 3.000
Max. :1 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :20.000
status
Length:67614
Class :character
Mode :character
# `status` is still a character column at this point, and base plot() cannot
# draw a character vector. Converting to a factor makes plot() draw a bar
# chart of how many rows fall in each status value.
plot(factor(data$status))

##Cleanup
# Count rows that are exact copies (all columns equal) of an earlier row.
# NOTE(review): ID is still part of the frame here, so rows that differ only
# in ID are not counted as duplicates.
data %>%
  duplicated() %>%
  sum()
[1] 0
# No duplicates
# Drop ID (an identifier, irrelevant here) and FLAG_MOBIL (1 in every row).
data <- select(data, -c(ID, FLAG_MOBIL))
# Names of the columns to be treated as categorical: text columns, the 0/1
# phone/e-mail flags, and `status`.
cols <- c(
  "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
  "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS",
  "NAME_HOUSING_TYPE", "FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL",
  "OCCUPATION_TYPE", "status"
)
print(cols)
[1] "CODE_GENDER" "FLAG_OWN_CAR" "FLAG_OWN_REALTY" "NAME_INCOME_TYPE"
[5] "NAME_EDUCATION_TYPE" "NAME_FAMILY_STATUS" "NAME_HOUSING_TYPE" "FLAG_WORK_PHONE"
[9] "FLAG_PHONE" "FLAG_EMAIL" "OCCUPATION_TYPE" "status"
# Convert every column named in `cols` to a factor.
data[cols] <- lapply(data[cols], factor)

# Replace missing occupations with an explicit "Unknown" level.
# read.csv() turns the literal string NA into NA but leaves blank character
# fields as "", so both forms are treated as missing here.
occupation <- data$OCCUPATION_TYPE
if (!"Unknown" %in% levels(occupation)) {
  levels(occupation) <- c(levels(occupation), "Unknown")
}
occupation[is.na(occupation)] <- "Unknown"
# Renaming the "" level to "Unknown" merges it into the existing level;
# this is a no-op when the column has no "" level.
levels(occupation)[levels(occupation) == ""] <- "Unknown"
data$OCCUPATION_TYPE <- occupation

# Recode the letter codes in `status`: "C" becomes "6" and "X" becomes "7".
# Renaming the levels relabels every matching row in one step.
levels(data$status)[levels(data$status) == "C"] <- "6"
levels(data$status)[levels(data$status) == "X"] <- "7"

# Disabled step, kept for reference: convert factors into numericals.
# data %<>% mutate_if(is.factor, as.numeric)
summary(data)
CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL NAME_INCOME_TYPE
F:43924 N:43107 N:21090 Min. : 0.0000 Min. : 26100 Commercial associate:15640
M:23690 Y:24507 Y:46524 1st Qu.: 0.0000 1st Qu.: 112500 Pensioner :11982
Median : 0.0000 Median : 157500 State servant : 5044
Mean : 0.4206 Mean : 178867 Student : 4
3rd Qu.: 1.0000 3rd Qu.: 225000 Working :34944
Max. :19.0000 Max. :6750000
NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE
Academic degree : 38 Civil marriage : 6016 Co-op apartment : 227
Higher education :16890 Married :44906 House / apartment :60307
Incomplete higher : 2306 Separated : 4125 Municipal apartment: 2303
Lower secondary : 716 Single / not married: 9528 Office apartment : 587
Secondary / secondary special:47664 Widow : 3039 Rented apartment : 1020
With parents : 3170
DAYS_BIRTH DAYS_EMPLOYED FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
Min. :-25201 Min. :-17531 0:53904 0:49071 0:60819 Unknown :20699 Min. : 1.000
1st Qu.:-19438 1st Qu.: -2886 1:13710 1:18543 1: 6795 Laborers :12425 1st Qu.: 2.000
Median :-15592 Median : -1305 Sales staff: 6899 Median : 2.000
Mean :-15914 Mean : 62022 Core staff : 6059 Mean : 2.174
3rd Qu.:-12347 3rd Qu.: -321 Managers : 4950 3rd Qu.: 3.000
Max. : -7489 Max. :365243 Drivers : 4427 Max. :20.000
(Other) :12155
status
0 :52133
1 : 6491
7 : 5790
6 : 1805
2 : 712
5 : 374
(Other): 309